# -*- coding: utf-8 -*-
#
# getting hot comments and latest comments for SINA News
#
# Mac, PostgreSQL 9.4, python 2.7
#
#
#
__author__ = 'qiidee'

import urllib
import urllib2
import urlparse
import json
import psycopg2
import time
import socket
import httplib
from random import randint
from random import uniform
import datetime
from urllib2 import Request, urlopen, URLError, HTTPError
from dateutil.parser import parse



request_headers = {
"Accept-Language": "en-US,en;q=0.5",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Connection": "keep-alive"
}

def _decode_list(data):
    rv = []
    for item in data:
        if isinstance(item, unicode):
            item = item.encode('utf-8')
        elif isinstance(item, list):
            item = _decode_list(item)
        elif isinstance(item, dict):
            item = _decode_dict(item)
        rv.append(item)
    return rv

def _decode_dict(data):
    rv = {}
    for key, value in data.iteritems():
        if isinstance(key, unicode):
            key = key.encode('utf-8')
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        elif isinstance(value, list):
            value = _decode_list(value)
        elif isinstance(value, dict):
            value = _decode_dict(value)
        rv[key] = value
    return rv

def _retrieve_news(table, newsid):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    select_sql = "SELECT * FROM %s WHERE newsid = \'%s\' " % (table, newsid)
    cursor.execute(select_sql)

    # retrieve the records from the database
    records = cursor.fetchall()

    return records

def _retrieve_comment(table, mid, newsid):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    select_sql = "SELECT * FROM %s WHERE mid = \'%s\' and newsid = \'%s\'" % (table, mid, newsid)
    cursor.execute(select_sql)

    # retrieve the records from the database
    records = cursor.fetchall()

    return records


def _update_comment(table, mid, newsid, st):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    update_sql = "UPDATE %s SET access_time2 = \'%s\' WHERE mid = \'%s\' and newsid = \'%s\'" % (table, st, mid, newsid)
    cursor.execute(update_sql)
    conn.commit()

def _retrieve_num_comment(newsid):
    '''
    retrieve number of collected record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    select_sql = "SELECT count(*) FROM comments3 WHERE newsid = \'%s\' " % (newsid)
    cursor.execute(select_sql)

    # retrieve the records from the database
    records = cursor.fetchall()

    return records

def _retrieve_collect_comment(newsid):
    '''
    retrieve number of record that can be collected
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    select_sql = "SELECT show from comment_page_info3 WHERE newsid = \'%s\' " % (newsid)
    cursor.execute(select_sql)

    # retrieve the records from the database
    records = cursor.fetchall()

    return records


def _insert_record(sql):

    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Insert records: connecting to database to insert \n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    cursor.execute(sql)
    conn.commit()

def _retrieve_link(table, date):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    # select_sql = "SELECT distinct(comment_url) FROM %s WHERE top_time = \'%s\'" % (table, date)
    select_sql = "SELECT distinct(comment_url) FROM %s WHERE create_date = \'%s\' and commentaccesstime is null" % (table, date)
    cursor.execute(select_sql)

    # retrieve the records from the database
    records = cursor.fetchall()

    return records

def _update_table_newsid(table, link, accesstime, newsid):
    '''
    update record with newsid
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    select_sql = "update %s set newsid = \'%s\', commentaccesstime = \'%s\' WHERE comment_url = \'%s\' " % (table, newsid, accesstime, link)
    cursor.execute(select_sql)

    # retrieve the records from the database
    conn.commit()

# def patch_http_response_read(func):
#     '''
#     Patch incomplete read
#     '''
#     def inner(*args):
#         try:
#             return func(*args)
#         except httplib.IncompleteRead, e:
#             return e.partial
#
#     return inner

def _form_sql(*args):
    # print args
    # print type(args)
    # print len(args)
    data = args[0]
    # print type(data)

    if data.has_key('default'):
        data['kefault'] = data['default']
        del data['default']
    if data.has_key('column'):
        data['kolumn'] = data['column']
        del data['column']
    keys = data.keys()
    keystring = ''
    keystring = ', '.join(keys)
    #print keystring
    valuestring = ''
    for key in keys:
        value = data[key]
        if isinstance(value, basestring):
            value = value.replace('\'', '_')
            value = value.replace('\"', '__')

            if (key == 'area' or key == 'content'
                or key == 'mid' or key == 'newsid'
                or key == 'time' or key == 'uid'
                or key == 'status' or key == 'title'
                or key == 'url' or key == 'countmode'
                or key == 'parent' or key == 'ip'
                or key == 'thread' or key == 'usertype'
                or key == 'usertype' or key == 'nick'
                or key == 'channel' or key == 'config'
                or key == 'kolumn' or key == 'script'
                or key == 'anonymous' or key == 'kefault'
                or key == 'show' or key == 'tagalias'
                or key == 'topic' or key == 'groups'
                or key == 'rank' or key == 'vote'
                or key == 'access_time' or key == 'newsurl'
                or key == 'subnum' or key == 'base'
                or key == 'level'):
                if len(value) == 0:
                    value = ' '
                valuestring = valuestring + '\'' + value + '\'' + ','
            if (key == 'against' or key == 'length'
                or key == 'agree'):
                valuestring = valuestring + str(value) + ','
        # elif key == 'content':
        #     valuestring = valuestring + '\'' + value + '\'' + ','
        else:
            valuestring = valuestring + str(value) + ','
    valuestring = valuestring[:-1]
    return keystring,valuestring


def get_latest_comments(latestlink, newsid):
    '''
    GET COMMENTS that are available
    '''

    # server is slow to pickup the connection.
    #time.sleep(randint(1,3))
    # print "getting latest comments..."
    # print "latest link is %s" % latestlink
    connect_timeout = 5
    try:
        req = urllib2.Request(latestlink, headers=request_headers)
        handler = urllib2.urlopen(req, timeout = 10.0)
        html = handler.read()
        # r = requests.get(url=latestlink,timeout=10.0)
    except:
        # print "Too slow Mojo!"
        time.sleep(randint(4,8))
        try:
            req = urllib2.Request(latestlink, headers=request_headers)
            handler = urllib2.urlopen(req, timeout = 10.0)
            html = handler.read()
        except:
            # print "Rest too short!"
            time.sleep(randint(5,10))
            try:
                req = urllib2.Request(latestlink, headers=request_headers)
                handler = urllib2.urlopen(req, timeout = 10.0)
                html = handler.read()
            except:
                print "Server must be down!"
                raise

    # print query
    data = json.loads(html, object_hook=_decode_dict)
    responsecode = data['result']['status']['code']
    if responsecode == 0:
        cmnlist = data['result']['cmntlist']
    else:
        # print responsecode
        print data['result']['status']['msg']
        time.sleep(randint(5,10))
        try:
            req = urllib2.Request(latestlink, headers=request_headers)
            handler = urllib2.urlopen(req, timeout = 10.0)
            html = handler.read()
        except:
            # print responsecode
            # print latestlink
            try:
                req = urllib2.Request(latestlink, headers=request_headers)
                handler = urllib2.urlopen(req, timeout = 10.0)
                html = handler.read()
            except:
                print "unable to get the page at this moment!"
                raise
        # page = r.content
        data = json.loads(html, object_hook=_decode_dict)
        responsecode = data['result']['status']['code']
        if responsecode == 0:
            cmnlist = data['result']['cmntlist']
        else:
            print responsecode
            print latestlink
            raise
    # get access time
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')

    for n in range(len(cmnlist)):
        # print cmnlist[n]['content']
        comment = cmnlist[n]
        comment['access_time'] = st
        comment['newsid'] = newsid
        mid = comment['mid']
        table = 'comments3'
        keystring, valuestring = _form_sql(comment)
        insert_sql = 'INSERT INTO ' + table + ' (' + keystring + ') VALUES (' + valuestring + ')'
        # insert_sql = 'INSERT INTO comments (' + keystring + ') VALUES (' + valuestring + ')'
        # print insert_sql


        # # INSERT RECORD INTO DATABASE
        record = _retrieve_comment(table, mid, newsid)
        if len(record) > 0:
            _update_comment(table, mid, newsid, st)
            #print "duplicate comment!"
            # print comment
        else:
            try:
                _insert_record(insert_sql)
                #print "insert comment!"
            except:
                print insert_sql
                raise

def get_link(commentlink, newsid):
    '''
    Get the basic information and hot comments
    And call a loop to get the latest comments
    '''
    # check if it is in the table
    table = 'comment_page_info3'
    # print insert_sql
    record = _retrieve_news(table, newsid)
    nrecord = len(record)
    print nrecord
    nrecord = 0
    if nrecord > 0:
        print "duplicate page count!"
        # check the latest news...return if necesary
        commentnum = _retrieve_num_comment(newsid)
        partialnumcomment = commentnum[0][0]
        totalnumcomment = _retrieve_collect_comment(newsid)
        totalcomment = totalnumcomment[0][0]
        # all comments have collected
        if partialnumcomment == totalcomment:
            return
        # collceted more
        else:
            startpage = (totalcomment - partialnumcomment)/50
            print totalcomment, partialnumcomment
            for num in range(startpage,(totalcomment/50 + 1)):
                latestlink = commentlink[:-1] + str(num) + '&page_size=50'
                get_latest_comments(latestlink, newsid)

    # the record is not in the table, start collecting....
    else:
        # server is slow to pickup the connection.
        connect_timeout = 5
        try:
            req = urllib2.Request(commentlink, headers=request_headers)
            handler = urllib2.urlopen(req, timeout = 10.0)
            html = handler.read()
        except:
            print "Too slow Mojo!"
            time.sleep(uniform(0.5,8.5))
            try:
                req = urllib2.Request(commentlink, headers=request_headers)
                handler = urllib2.urlopen(req, timeout = 10.0)
                html = handler.read()
            except:
                print "Rest too short!"
                time.sleep(uniform(6.5,20.5))
                try:
                    req = urllib2.Request(commentlink, headers=request_headers)
                    handler = urllib2.urlopen(req, timeout = 10.0)
                    html = handler.read()
                except:
                    print "Server must be down!"
                    raise

        data = json.loads(html, object_hook=_decode_dict)
        # pp.pprint(data)
        comments = data['result']
        # print type(comments)
        numComments = len(comments)
        # print numComments
        if 'cmntlist' in comments.keys():

            cmnlist = data['result']['cmntlist']
            # print 'length of comments are %s' % len(cmnlist)
            grouplist = comments['grouplist']
            # print 'length of group comments are %s' % len(grouplist)
            hot_list = comments['hot_list']
            # print 'length of hot comments are %s' % len(hot_list)

            replydict = comments['replydict']

            page_count = comments['count']
            encoding = comments['encoding']
            wb_verified = comments['cmntlist_wb_verified']

            # print page_count, encoding, wb_verified

            # get access time
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')

            # print '\n\n page count'
            page_count['access_time'] = st
            page_count['newsid'] = newsid
            totalcomment = page_count['show']
            totalpages = totalcomment/50 + 2
            keystring, valuestring = _form_sql(page_count)
            insert_sql = 'INSERT INTO ' + table + ' (' + keystring + ') VALUES (' + valuestring + ')'
            # print insert_sql
            # print "new page count"

            # # INSERT comment_page_info INTO database
            _insert_record(insert_sql)


            for n in range(len(grouplist)):

                # print '\n\n group comments'
                group_count = grouplist[n]['count']

                group_count['access_time'] = st
                group_count['newsid'] = newsid

                table = 'comment_pageinfo_hidden3'
                keystring, valuestring = _form_sql(group_count)
                insert_sql = 'INSERT INTO ' + table + ' (' + keystring + ') VALUES (' + valuestring + ')'
                # print insert_sql

                # # INSERT hidden page information INTO DATABASE

                record = _retrieve_news(table, newsid)
                if len(record) > 0:
                    print "duplicate page hidden info!"
                else:
                    _insert_record(insert_sql)


                # print "\n\n metadata"
                groupnews = grouplist[n]['news']

                groupnews['access_time'] = st
                groupnews['newsid'] = newsid

                table = 'comment_metadata3'
                keystring, valuestring = _form_sql(groupnews)
                insert_sql = 'INSERT INTO ' + table + ' (' + keystring + ') VALUES (' + valuestring + ')'
                # insert_sql = 'INSERT INTO comment_metadata (' + keystring + ') VALUES (' + valuestring + ')'
                print insert_sql
                # newsurl = groupnews['newsurl']

                # # INSERT comment metadata INTO DATABASE
                record = _retrieve_news(table, newsid)
                if len(record) > 0:
                    # print "duplicate metadata!"
                    pass
                else:
                    _insert_record(insert_sql)

            # print '\n\n hot comments'
            # print(hot_list)
            for n in range(len(hot_list)):
                # print hot_list[n]['content']
                # print hot_list[n]['agree']
                # print hot_list[n]['area']
                hotcomment = hot_list[n]
                hotcomment['access_time'] = st
                hotcomment['newsid'] = newsid
                table = 'hotcomment3'
                mid = hotcomment['mid']
                keystring, valuestring = _form_sql(hotcomment)
                insert_sql = 'INSERT INTO ' + table + ' (' + keystring + ') VALUES (' + valuestring + ')'
                # insert_sql = 'INSERT INTO hotcomment (' + keystring + ') VALUES (' + valuestring + ')'
                # print insert_sql

                # # INSERT hot comments INTO DATABASE

                record = _retrieve_comment(table, mid, newsid)
                if len(record) > 0:
                    # print "duplicate hot comments!"
                    _update_comment(table, mid, newsid, st)
                    # print hotcomment
                else:
                    try:
                        _insert_record(insert_sql)
                    except:
                        print insert_sql
                        raise

            # print page_count, encoding, wb_verified

            print '\n\n page count'
            page_count['access_time'] = st
            page_count['newsid'] = newsid
            totalcomment = page_count['show']
            totalpages = totalcomment/50 + 2
            print("total pages are %s" % totalpages)

            # get latest comments -- take too long to complete
            print '\n\n latest comments'

            # for num in range(1, totalpages):
            for num in range(1, totalpages):
                latestlink = commentlink[:-1] + str(num) + '&page_size=50'
                try:
                    get_latest_comments(latestlink, newsid)
                except:
                    time.sleep(uniform(0.5,2.5))
                    try:
                        get_latest_comments(latestlink, newsid)
                    except:
                        print "fail to get comments"
                        print latestlink




if __name__ == '__main__':
    #newsid: comment news story id
    #channel: unclear
    # page: visible page?  default = 1
    # page_size: number of comments?  default = 20
    # jsvar: unclear javascipt object? default = requestId_97756450

    socket.setdefaulttimeout(20)
    table = 'sinaclick'
    startDate = datetime.date(2015,8,15)
    watchPeriod = 366
    for day in range(watchPeriod):
        newsDate = startDate + datetime.timedelta(day)
        newsDate = newsDate.strftime('%Y%m%d')
        print "\n\n\n now working on..."
        print newsDate
        print "\n\n\n"
        records = _retrieve_link(table, newsDate)
        for record in records:
            link = record[0]
            parameters = link.split('newsid=')
            # print parameters
            newsid = parameters[1]
            newsid = newsid.split('&')[0]
            head = link.split('news.sina.com.cn')
            #print newsid
            # newsid = 'newsid=1-1-18227098'
            # urlhead = 'http://comment5.news.sina.com.cn/page/info?&channel=gn&' + newsid + '&page=1'
            urlhead = head[0] + 'news.sina.com.cn' + '/page/info?&channel=gn&newsid=' + newsid + '&page=1'
            # print urlhead
            # get comments/shares
            try:
                get_link(urlhead, newsid)

                # get access time
                ts = time.time()
                st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
                _update_table_newsid(table, link, st, newsid)
            except:
                time.sleep(600)
            break

    print "it's done!"

